{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#作者：1621430024\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "import lightgbm as lgb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('../input/sf-crime/train.csv.zip', parse_dates=['Dates'])\n",
    "test_data = pd.read_csv('../input/sf-crime/test.csv.zip', parse_dates=['Dates'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 878049 entries, 0 to 878048\n",
      "Data columns (total 9 columns):\n",
      "Dates         878049 non-null datetime64[ns]\n",
      "Category      878049 non-null object\n",
      "Descript      878049 non-null object\n",
      "DayOfWeek     878049 non-null object\n",
      "PdDistrict    878049 non-null object\n",
      "Resolution    878049 non-null object\n",
      "Address       878049 non-null object\n",
      "X             878049 non-null float64\n",
      "Y             878049 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(2), object(6)\n",
      "memory usage: 60.3+ MB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 884262 entries, 0 to 884261\n",
      "Data columns (total 7 columns):\n",
      "Id            884262 non-null int64\n",
      "Dates         884262 non-null datetime64[ns]\n",
      "DayOfWeek     884262 non-null object\n",
      "PdDistrict    884262 non-null object\n",
      "Address       884262 non-null object\n",
      "X             884262 non-null float64\n",
      "Y             884262 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(2), int64(1), object(3)\n",
      "memory usage: 47.2+ MB\n"
     ]
    }
   ],
   "source": [
    "train_data.info()\n",
    "test_data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transform data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_features = pd.concat((train_data.iloc[:, [0, 3, 4, 6, 7, 8]],\n",
    "                          test_data.iloc[:, [1, 2, 3, 4, 5, 6]]),\n",
    "                         sort=False)\n",
    "\n",
    "num_train = train_data.shape[0]\n",
    "\n",
    "train_labels = pd.get_dummies(train_data['Category']).values\n",
    "num_outputs = train_labels.shape[1]\n",
    "train_labels = np.argmax(train_labels, axis=1)\n",
    "\n",
    "all_features['year'] = all_features.Dates.dt.year\n",
    "all_features['month'] = all_features.Dates.dt.month\n",
    "all_features['new_year'] = all_features['month'].apply(\n",
    "    lambda x: 1 if x == 1 or x == 2 else 0)\n",
    "all_features['day'] = all_features.Dates.dt.day\n",
    "all_features['hour'] = all_features.Dates.dt.hour\n",
    "all_features['evening'] = all_features['hour'].apply(lambda x: 1\n",
    "                                                     if x >= 18 else 0)\n",
    "\n",
    "wkm = {\n",
    "    'Monday': 0,\n",
    "    'Tuesday': 1,\n",
    "    'Wednesday': 2,\n",
    "    'Thursday': 3,\n",
    "    'Friday': 4,\n",
    "    'Saturday': 5,\n",
    "    'Sunday': 6\n",
    "}\n",
    "all_features['DayOfWeek'] = all_features['DayOfWeek'].apply(lambda x: wkm[x])\n",
    "all_features['weekend'] = all_features['DayOfWeek'].apply(\n",
    "    lambda x: 1 if x == 4 or x == 5 else 0)\n",
    "\n",
    "OneHot_features = pd.get_dummies(all_features['PdDistrict'])\n",
    "\n",
    "all_features['block'] = all_features['Address'].apply(\n",
    "    lambda x: 1 if 'block' in x.lower() else 0)\n",
    "\n",
    "PCA_features = all_features[['X', 'Y']].values\n",
    "Standard_features = all_features[['DayOfWeek', 'year', 'month', 'day',\n",
    "                                  'hour']].values\n",
    "OneHot_features = pd.concat([\n",
    "    OneHot_features, all_features[['new_year', 'evening', 'weekend', 'block']]\n",
    "],\n",
    "                            axis=1).values\n",
    "\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(Standard_features)\n",
    "Standard_features = scaler.transform(Standard_features)\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "pca.fit(PCA_features)\n",
    "PCA_features = pca.transform(PCA_features)\n",
    "\n",
    "all_features = np.concatenate(\n",
    "    (PCA_features, Standard_features, OneHot_features), axis=1)\n",
    "\n",
    "train_features = all_features[:num_train]\n",
    "num_inputs = train_features.shape[1]\n",
    "test_features = all_features[num_train:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_train = lgb.Dataset(train_features, label = train_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Start up!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'boosting': 'gbdt', \n",
    "    'objective': 'multiclass',\n",
    "    'metrics' : 'multi_logloss',\n",
    "    'num_class': num_outputs,\n",
    "    'verbosity': 1,\n",
    "    'device_type':'gpu',\n",
    "    'gpu_platform_id':0,\n",
    "    'gpu_device_id':0,\n",
    "    'max_depth': 6,\n",
    "    'num_leaves': 51,\n",
    "    'min_data_in_leaf' : 25,\n",
    "    'feature_fraction': 0.79,\n",
    "    'learning_rate': 0.01,\n",
    "    }\n",
    "gbm = lgb.train(params, data_train, num_boost_round = 2000)\n",
    "gbm.save_model('../working/gbm(v2).txt')\n",
    "testResult = gbm.predict(test_features)\n",
    "sampleSubmission = pd.read_csv('../input/sf-crime/sampleSubmission.csv.zip')\n",
    "Result_pd = pd.DataFrame(testResult,\n",
    "                         index=sampleSubmission.index,\n",
    "                         columns=sampleSubmission.columns[1:])\n",
    "Result_pd.to_csv('../working/sampleSubmission(gbmv2).csv', index_label='Id')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
